Load libraries:
library(ggplot2)
library(readr)
library(RColorBrewer)
library(reshape2)
library(dplyr)
library(plotly)
library(scales)
Read the dataset:
df <- read_csv("data/gun-violence-data_01-2013_03-2018.csv")
There are a few columns we don’t care about - drop those
gun <- select(df, -c(incident_url, source_url, incident_url_fields_missing, congressional_district, sources, state_house_district, state_senate_district))
How many gun casualties (defined as number killed and injured) were there for each year?
gun %>% group_by(Year = format(date, "%Y")) %>%
summarize(n = sum(n_injured + n_killed)) %>%
ggplot(aes(x = Year, y = n)) + geom_col()
Clearly there are missing data from 2013, and 2018 isn’t complete yet. Let’s drop these two years. Let’s also separate the suicides.
gun <- filter(gun, format(date, "%Y") != 2018 & format(date, "%Y") != 2013)
suicide <- filter(gun, grepl("\\|\\|Suicide\\^", incident_characteristics)
& n_killed + n_injured < 2)
gun <- filter(gun, !(incident_id %in% suicide$incident_id))
How many gun casualties were there in each state for 2014-2017?
cols <- brewer.pal(3, "Dark2")
gun %>% group_by(state) %>%
summarize(n_casualty = sum(n_killed + n_injured)) %>%
ggplot(aes(x = reorder(state, -n_casualty), y = n_casualty)) +
geom_col(fill = cols[1]) +
theme(axis.text.x = element_text(angle=90, vjust=0.5, size = 14)) +
theme(axis.title.y = element_text(size = 16)) +
theme(plot.title = element_text(size = 16)) +
labs(x = NULL, y = "Casualties (Injuries + Fatalities)",
title = "Number of Gun Casualties in the US by State for 2014-2017")
The top 5 states with the highest casualty rate are:
However, these are also pretty populated states. What we really want to know is the per capita rate, since population is an important factor. I got census data for each state from 2010-2017 from the U.S. Census Bureau, and then calculated the per capita rates (per hundred thousand people) for each year:
# READ IN US POPULATION DATA BY STATE
colNames <- c("id1", "id2", "state", "census2010", "cenbase2010", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017")
pop <- read_csv("data/US\ Population\ by\ State/PEP_2017_PEPANNRES_with_ann.csv", skip = 2, col_names = colNames)
# Only keep the columns and rows we want
pop <- select(pop, id1, state, "2014", "2015", "2016", "2017") %>% slice(6:n())
# Casualties by state and year; I removed Washington DC from the state rankings, since it's technically a city, not a state
casByState <- group_by(gun, state, year = format(date, "%Y")) %>% filter(state != "District of Columbia") %>%
summarize(n.casualty = sum(n_killed + n_injured), n.injured = sum(n_injured), n.killed = sum(n_killed))
# Add the population and the per capita rate (per 100K people)
# First, generate the population vector:
popList <- numeric()
for (i in 1:nrow(casByState)) {
popList <- c(popList, as.numeric(pop[pop$state == casByState$state[i], casByState$year[i]]))
}
casByState <- ungroup(casByState) %>% mutate(population = popList, cas.per.capita100K = n.casualty/population*100000, k.per.capita100K = n.killed/population*100000, i.per.capita100K = n.injured/population*100000)
Now that we have the per-state data for each year, let’s take a look at it:
## # A tibble: 200 x 9
## state year n.casualty n.injured n.killed population cas.per.capita10…
## <chr> <chr> <int> <int> <int> <dbl> <dbl>
## 1 Alaba… 2014 914 591 323 4840037. 18.9
## 2 Alaba… 2015 937 562 375 4850858. 19.3
## 3 Alaba… 2016 1234 761 473 4860545. 25.4
## 4 Alaba… 2017 1387 856 531 4874747. 28.5
## 5 Alaska 2014 70 49 21 736759. 9.50
## 6 Alaska 2015 143 84 59 737979. 19.4
## 7 Alaska 2016 170 103 67 741522. 22.9
## 8 Alaska 2017 125 70 55 739795. 16.9
## 9 Arizo… 2014 435 222 213 6706435. 6.49
## 10 Arizo… 2015 397 193 204 6802262. 5.84
## # ... with 190 more rows, and 2 more variables: k.per.capita100K <dbl>,
## # i.per.capita100K <dbl>
To make plotting a little simpler, let’s take the mean of each per capita rate for each state across each year. So, for each state there will be four entries: 2014, 2015, 2016, and 2017.
meanCBS <- group_by(casByState, state) %>% summarize(meanCasRate = mean(cas.per.capita100K), meanKRate = mean(k.per.capita100K), meanIRate = mean(i.per.capita100K))
Now let’s plot it and see:
ggplot(data = meanCBS, aes(x = reorder(state, -meanCasRate), y = meanCasRate)) +
geom_col(fill = cols[2]) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 14)) +
theme(axis.title.y = element_text(size = 16)) +
theme(plot.title = element_text(size = 16)) +
labs(x = NULL, y = "Mean Casualty Rate per 100,000 People",
title = "Mean Casualty Rate per 100,000 People by State (2014-2017)") +
geom_hline(yintercept = median(meanCBS$meanCasRate), lwd = 1)
The black line represents the median value for all 50 states. The rankings have changed significantly. The top 5 states with the highest per capita casualty rate are:
The only state that remained in the top 5 after adjusting for population was Illinois, and that dropped one level.
Let’s now look at the cities with the highest rates of per capita gun violence. I again used data from the US Census Bureau to find the population of cities in the United States. The dataset I used contained data from 2010-2016 for all cities greater than 50,000 residents. I had to do some manipulation of the city names in order to make them match the cities in the original dataset. Also, since there are no data for 2017, I just used the data for 2016 and assumed there wouldn’t be any significant changes.
# READ IN US POPULATION DATA BY CITY
colNames <- c("id1", "id2", "country", "tid1", "tid2", "rank", "geography1", "city", "census2010", "cenbase2010", "2010", "2011", "2012", "2013", "2014", "2015", "2016")
citypop <- read_csv("data/US\ Population\ by\ City/PEP_2016_PEPANNRSIP.US12A_with_ann.csv", skip = 2, col_names = colNames)
# Only keep the columns and rows we want (drop all the "government" and "county" entries)
citypop <- select(citypop, tid2, city, "2014", "2015", "2016") %>% filter(!grepl("[Cc]ounty|government", city))
# Reformatting city names
cityState <- sub("^(.+) (city( \\(balance\\))?|municipality|village|town), (.+)", "\\1, \\4", citypop$city) %>% strsplit(", ")
# split up the city column into city and state columns
cityState <- as.data.frame(matrix(unlist(cityState), nrow = length(cityState), byrow = TRUE), stringsAsFactors = FALSE)
names(cityState) <- c("city", "state")
# Ventura, CA has a nonstandard entry; fix it
cityState$city[grep("\\(", cityState$city)] <- "Ventura"
# In the original gun violence data, the cities listed for New York are split into the five boroughs. So, for example, Brooklyn and The Bronx are separate entries, when they need to be considered "New York" for our purposes. Let's fix that manually.
ny <- filter(gun, state == "New York")
nycInd <- grep("Manhattan|Queens|Bronx|Staten Island|Brooklyn|New York City", ny$city_or_county)
nyc <- ny[nycInd,] %>% filter(city_or_county != "Queensbury")
# Now go through the gun dataframe and change all relevant city_or_state entries to "New York"
for(i in 1:nrow(nyc)) {
incident <- nyc$incident_id[i]
index <- which(gun$incident_id == incident)
gun[index,"city_or_county"] <- "New York"
}
# Integrate the new columns back to the tibble and rearrange
# Also, since we are missing data for 2017, just reuse the 2016 column
citypop <- mutate(citypop, city = cityState$city, state = cityState$state, "2017" = citypop$`2016`) %>% select(tid2, city, state, "2014", "2015", "2016", "2017")
Get the data by city for each year. Note: some years will be missing because those cities did not have any reported gun crimes in those years. Although this will affect the mean values for cities without any reported gun crimes in some years, we are only going to look at the top 50 cities. For these cities to be in the top 50, it stands to reason that there will be crimes from all four years.
casByCity <- gun %>% group_by(city_or_county, state, year = format(date, "%Y")) %>% summarize(n.casualty = sum(n_killed + n_injured), n.killed = sum(n_killed), n.injured = sum(n_injured))
casByCity
## # A tibble: 30,952 x 6
## # Groups: city_or_county, state [?]
## city_or_county state year n.casualty n.killed n.injured
## <chr> <chr> <chr> <int> <int> <int>
## 1 Abbeville Alabama 2016 1 0 1
## 2 Abbeville Alabama 2017 1 1 0
## 3 Abbeville Georgia 2017 0 0 0
## 4 Abbeville Louisiana 2014 5 2 3
## 5 Abbeville Louisiana 2015 5 0 5
## 6 Abbeville Louisiana 2016 4 1 3
## 7 Abbeville Louisiana 2017 4 1 3
## 8 Abbeville South Carolina 2014 1 0 1
## 9 Abbeville South Carolina 2016 1 0 1
## 10 Abbeville South Carolina 2017 4 1 3
## # ... with 30,942 more rows
Now, we need a per capita estimate for each city. First we need to generate a vector of populations: figure out if the city in our data frame is actually in the population dataset, and if it is, grab the population for that particular year. Again, the 2017 population values are the same as the 2016 ones.
# Get a per capita (100K) estimate for each city
# First, generate the population vector:
popList <- numeric()
for (i in 1:nrow(casByCity)) {
theCity <- casByCity$city_or_county[i]
theState <- casByCity$state[i]
theYear <- casByCity$year[i]
ind <- which(citypop$city == theCity)
if (length(ind) > 0 && theCity == citypop$city[ind] && theState == citypop$state[ind]) {
popList <- c(popList, as.numeric(citypop[citypop$state ==
theState & citypop$city == theCity, theYear]))
} else {
popList <- c(popList, NA)
}
}
We’re also going to need the two letter abbreviations for each state for labels later, so let’s grab those and put them in the data frame as well. Don’t forget Washington, D.C.!:
# Get the short name for each state and attach it to the tibble
data(state)
abbs <- data.frame("abb" = state.abb, "state" = state.name, stringsAsFactors = FALSE)
abbs <- rbind(abbs, c("DC", "District of Columbia"), stringsAsFactors = FALSE)
abbList <- unlist(lapply(casByCity$state, function(x) abbs[which(abbs$state == x),]$abb))
Add the population and abbrevation columns, and calculate the mean values for each each state:
meanCBC <- ungroup(casByCity) %>% mutate(abb = abbList, population = popList, cas.per.capita100K = n.casualty/population*100000, k.per.capita100K = n.killed/population*100000, i.per.capita100K = n.injured/population*100000)
# Remove NA values and then compute the mean on the remaining values; sort by mean
meanCBC <- group_by(meanCBC, city_or_county, state, abb) %>% filter(!is.na(population)) %>% summarize(meanCasRate = mean(cas.per.capita100K), meanKRate = mean(k.per.capita100K), meanIRate = mean(i.per.capita100K)) %>% arrange(desc(meanCasRate))
meanCBC
## # A tibble: 700 x 6
## # Groups: city_or_county, state [700]
## city_or_county state abb meanCasRate meanKRate meanIRate
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Trenton New Jersey NJ 160. 23.7 137.
## 2 New Orleans Louisiana LA 156. 41.6 114.
## 3 Gary Indiana IN 151. 50.3 100.
## 4 Baltimore Maryland MD 133. 40.4 92.6
## 5 Flint Michigan MI 132. 39.8 92.0
## 6 Savannah Georgia GA 110. 25.3 84.9
## 7 Chicago Illinois IL 110. 18.4 91.6
## 8 Birmingham Alabama AL 108. 39.1 69.2
## 9 Jackson Mississippi MS 107. 32.0 75.4
## 10 Dayton Ohio OH 105. 23.1 82.3
## # ... with 690 more rows
Let’s plot the top 50:
top50 <- meanCBC[1:50,]
top50 <- arrange(top50, meanCasRate)
top50 <- mutate(top50, meanIRate = -meanIRate)
top50melt2 <- melt(top50[,c("city_or_county", "abb", "meanKRate", "meanIRate")], id.vars = c("city_or_county", "abb"))
ggplot(data = top50melt2) + geom_col(aes(x = city_or_county, y = value, fill = variable)) +
scale_y_continuous(limits = c(-150,150), breaks = c(150, 100, 50, 0, -50, -100, -150), labels = c("150", "100", "50", "0", "50", "100", "150")) +
theme(axis.text.x = element_text(hjust = .95, size = 12)) +
theme(axis.text.y = element_text(vjust = 0.5, size = 10)) +
theme(axis.title.y = element_text(size = 14)) +
theme(plot.title = element_text(size = 16)) +
theme(legend.title = element_blank()) +
theme(legend.position = "bottom") +
theme(legend.text = element_text(size = 14)) +
scale_fill_discrete(labels = c("Fatalities", "Injuries"), guide = guide_legend(reverse = TRUE)) +
labs(x = NULL, y = NULL) +
labs(title = "Mean Injury/Fatality Rate per 100,000 People by City (2014-2017 where available)") +
scale_x_discrete(limits = top50$city_or_county, labels = paste(top50$city_or_county, sep = ", ", top50$abb)) +
coord_flip()
According to the data, the top 5 cities with the most gun violence are:
Chicago comes in at 7th, even though Illinois was the second highest per capita state.
One topic that is furiously being debated across the nation is whether or not to pass more/stricter gun laws on top of the ones that already exist. What is the effect of gun laws on casualty rate? Let’s analyze this both at the state level and city level.
First, read the data for the state laws into R. The data were downloaded from the State Firearm Law Database (http://www.statefirearmlaws.org/). I had to reformat the files by re-saving into Excel because they weren’t actually saved as a CSV, so it wouldn’t have been easy to read it into R. You can download the reformatted CSV files from Github.
lawsdf <- read_csv("data/Laws/48_states_1991_data.csv")
lawsdf <- rbind(lawsdf, read_csv("data/Laws/Hawaii_1991_data.csv"))
lawsdf <- rbind(lawsdf, read_csv("data/Laws/Alaska_1991_data.csv"))
# Rename one of the columns to something easy to reference
lawsdf <- rename(lawsdf, law = `Law? 0/1`)
laws.by.year <- group_by(lawsdf, State, Year) %>% summarize(laws = sum(law))
# Grab the years we are interested in, 2014-2017, and take the average number of laws for the period
mean.laws.by.year <- filter(laws.by.year, Year >= 2014) %>% summarize(meanlaw = mean(laws))
For a first pass, all we are looking at is the absolute number of laws on the books in each state. For simplicity, we assume that each law is as effective as the next (although that is probably not quite true, and each law’s effectiveness probably varies across each state/municipality and has a number of confounding variables). Let’s look at the distribution of the number of laws on the books for all 50 states:
# Histogram of number of laws
ggplot(data = mean.laws.by.year, aes(x = meanlaw)) + geom_histogram(binwidth = 10, col = "black") + labs(x = "Mean Number of Laws, 2014-2017", y = "Frequency")
It appears that half the states have around 15 or fewer laws on the books, with the average number of laws being around 25.
Is there a correlation between the number of laws in each state and their mean per capita casualty rate?
meanCBS <- arrange(meanCBS, state)
mean.laws.by.year <- arrange(mean.laws.by.year, State)
meanCBS <- mutate(meanCBS, numlaws = mean.laws.by.year$meanlaw)
qplot(x = numlaws, y = meanCasRate, data = meanCBS, xlab = "Number of Laws", ylab = "Mean Casualty Rate Per 100,000 People")
It doesn’t appear so. Some states have few laws on the books and have a lower casualty rate, whereas others achieve the low casualty rate with a higher than average number of laws. There are also a lot of states with a high casualty rate and few laws. It could be argued that the states with more laws would actually be more violent if the laws didn’t exist, but it is difficult to tease out that effect.
Let’s look at how each state ranks in terms of casualty rate and the number of gun laws:
g <- ggplot(data = meanCBS, aes(x = reorder(state, -meanCasRate), y = meanCasRate, fill = numlaws))
p2 <- g + geom_col(color = "gray") +
theme(axis.text.x = element_text(angle=90, vjust=0.5, size = 14)) +
theme(axis.title.y = element_text(size = 16)) +
theme(plot.title = element_text(size = 16)) +
labs(x = NULL, y = "Mean Casualty Rate per 100,000 People",
title = "Mean Casualty Rate per 100,000 People by State (2014-2017)")
pcaswithlaw <- p2 + scale_fill_gradient2(low = muted("red"), mid = "white", high = muted("blue"), midpoint = max(meanCBS$numlaws)/2, name = "Avg. Number of Gun Laws", guide = guide_colorbar(direction = "horizontal", title.position = "top", barwidth = 10, title.hjust = .5)) +
theme(legend.background = element_rect(fill="gray90", size=.3, color = "black", linetype = 1)) +
theme(legend.justification=c(1,1), legend.position=c(1,1)) +
geom_hline(yintercept = median(meanCBS$meanCasRate), lwd = 1)
pcaswithlaw
The top 7 states with the highest average number of laws from 2014-2017 are California (103), Massachusetts (100), Connecticut (86.5), New York (75), New Jersey (68.8), and Illinois (64). Of these seven, two states are above the median casualty rate for the US (Illinois and Maryland) - the other five rank a little below the median rate. It appears there may be a weak association between number of laws and state casualty rate, but there are probably other factors at play as well, such as population density. There are a number of states below and above the median casualty rate with fewer gun laws, so it is difficult to conclude that more gun laws translate to less gun violence.
Let’s look at the data by city and try to figure out if there is a correlation between the number of laws in a particular state and the per capita casualty rates for cities in that state:
# First, remove DC since we don't have data on the number of gun laws there
meanCBClaws <- filter(meanCBC, state != "District of Columbia")
lawlist <- sapply(meanCBClaws$state, function(x) mean.laws.by.year[mean.laws.by.year$State == x,2])
meanCBClaws <- ungroup(meanCBClaws) %>% mutate("numlaws" = round(unlist(lawlist), digits = 1), meanCasRate = round(meanCasRate, digits = 1), meanKRate = round(meanKRate, digits = 1), meanIRate = round(meanIRate, digits = 1))
oldnames <- names(meanCBClaws) # Save the current column names for later
# Do some renaming to make things easier to understand
names(meanCBClaws) <- c("City", "state.long", "State", "Casualty Rate", "Fatality Rate", "Injury Rate", "Number of Laws")
p.allcities <- ggplot(data = meanCBClaws, aes(x = `Number of Laws`, y = `Casualty Rate`, color = State)) +
geom_point(aes(text = paste(City, ", ", State, "<br>Number of Laws: ", `Number of Laws`, "<br>Casualty Rate: ", `Casualty Rate`, "<br>Fatality Rate: ", `Fatality Rate`, "<br>Injury Rate: ", `Injury Rate`, sep = ""))) +
labs(x = "Number of Laws") +
labs (y = "Casualty Rate (Per 100,000 People)") +
theme(legend.position = "none")
ggplotly(p.allcities, tooltip = c("text"))
There are 699 cities displayed here (there are actually a lot more cities throughout the US where gun violence occurred, but the population data were only available for cities over 50,000 people). It is clear that in a given state, there is a wide distribution in the casualty rates. You can hover over each point to see the data it represents.
Let’s look at a box and whisker plot in order to see the distribution in city casualty rates in each state:
ggplot(data = meanCBClaws, aes(x = state.long, y = `Casualty Rate`, fill = `Number of Laws`)) +
geom_boxplot() +
labs(x = NULL) +
labs (y = "Casualty Rate (Per 100,000 People)") +
scale_fill_gradient2(midpoint = max(meanCBClaws$`Number of Laws`)/2, name = "Avg. Number of State Gun Laws", guide = guide_colorbar(direction = "horizontal", title.position = "top", barwidth = 10, title.hjust = .5)) +
theme(legend.background = element_rect(fill="gray90", size=.3, color = "black", linetype = 1)) +
theme(legend.justification=c(1,1), legend.position=c(1,1)) +
theme(axis.text.x = element_text(angle=90, vjust=0.5, size = 14)) +
theme(axis.title.y = element_text(size = 16)) +
theme(plot.title = element_text(size = 16))
SAY SOMETHING HERE
Now let’s take another look at the injury/fatality rates for the Top 49 cities with the highest rates of gun casualty, this time colored by number of state laws. These are the same cities as those in the Top 50 rank we did earlier, except we’ve excluded Washington, DC.
names(meanCBClaws) <- oldnames
# Take the top 49 cities and change the injury rate to negative for easy plotting
top49 <- meanCBClaws[1:49,] %>% arrange(meanCasRate) %>% mutate(meanIRate = -meanIRate)
top49melt <- melt(top49[,c("city_or_county", "abb", "meanKRate", "meanIRate", "numlaws")], id.vars = c("city_or_county", "abb", "numlaws"))
kiplot3 <- ggplot(data = top49melt) + geom_col(aes(x = city_or_county, y = value, fill = numlaws)) +
scale_y_continuous(breaks = c(50, 0, -50, -100, -150), labels = c("50", "0", "50", "100", "150")) +
theme(axis.text.x = element_text(hjust = .95, size = 12)) +
theme(axis.text.y = element_text(vjust = 0.5, size = 10)) +
theme(axis.title.y = element_text(size = 14)) +
theme(plot.title = element_text(size = 16)) +
labs(x = NULL, y = NULL) +
labs(title = "Mean Injury/Fatality Rate per 100,000 People by City (2014-2017 where available)") +
scale_x_discrete(limits = top49$city_or_county, labels = paste(top49$city_or_county, sep = ", ", top49$abb)) +
geom_hline(yintercept = 0, lwd = 2, color = "black") +
coord_flip()
citycaswithlaw <- kiplot3 + scale_fill_gradient2(midpoint = max(top49$numlaws)/2, name = "Avg. Number of State Gun Laws", guide = guide_colorbar(direction = "horizontal", title.position = "top", barwidth = 10, title.hjust = .5)) +
theme(legend.background = element_rect(fill="gray90", size=.3, color = "black", linetype = 1)) +
theme(legend.justification=c(.125,.125), legend.position=c(.125,.125)) +
annotate("label", x = 25, y = -100, label = "Injuries", size = 6) +
annotate("label", x = 25, y = 45, label = "Fatalities", size = 6)
citycaswithlaw
Limitations: Individual city laws are not considered. For example, New York City has an average casualty rate of 0.02 per hundred thousand people (effectively 0) for 2014-2017! It is one of the SAFEST and MOST POPULOUS cities in terms of gun violence in the United States. It also has some of the strictest laws in the US. Clearly, the laws in New York City are working to curb gun violence (although overall crime is not considered here).
(ACTUALLY - THE DATA FOR NEW YORK ARE NOT QUITE RIGHT; INDIVIDUAL AREAS LIKE QUEENS AND BRONX ARE LISTED AS DIFFERENT CITIES, BUT THE CITY POPULATION DATA MAY COVER THOSE CITIES)